# import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mn #visualizing missing data
import plotly.express as px
import plotly.io as pio
# This ensures Plotly output works in multiple places:
# plotly_mimetype: VS Code notebook UI
# notebook: "Jupyter: Export to HTML" command in VS Code
# See https://plotly.com/python/renderers/#multiple-renderers
pio.renderers.default = "notebook"
pio.templates.default = "simple_white"
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
nyc_df = pd.read_csv("311_Service_Requests_from_2010_to_Present.csv")
nyc_df.head()
| Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Zip | Incident Address | ... | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Garage Lot Name | Ferry Direction | Ferry Terminal Name | Latitude | Longitude | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 32310363 | 12/31/2015 11:59:45 PM | 01/01/2016 12:55:15 AM | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Music/Party | Street/Sidewalk | 10034.0 | 71 VERMILYEA AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.865682 | -73.923501 | (40.86568153633767, -73.92350095571744) |
| 1 | 32309934 | 12/31/2015 11:59:44 PM | 01/01/2016 01:26:57 AM | NYPD | New York City Police Department | Blocked Driveway | No Access | Street/Sidewalk | 11105.0 | 27-07 23 AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.775945 | -73.915094 | (40.775945312321085, -73.91509393898605) |
| 2 | 32309159 | 12/31/2015 11:59:29 PM | 01/01/2016 04:51:03 AM | NYPD | New York City Police Department | Blocked Driveway | No Access | Street/Sidewalk | 10458.0 | 2897 VALENTINE AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.870325 | -73.888525 | (40.870324522111424, -73.88852464418646) |
| 3 | 32305098 | 12/31/2015 11:57:46 PM | 01/01/2016 07:43:13 AM | NYPD | New York City Police Department | Illegal Parking | Commercial Overnight Parking | Street/Sidewalk | 10461.0 | 2940 BAISLEY AVENUE | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.835994 | -73.828379 | (40.83599404683083, -73.82837939584206) |
| 4 | 32306529 | 12/31/2015 11:56:58 PM | 01/01/2016 03:24:42 AM | NYPD | New York City Police Department | Illegal Parking | Blocked Sidewalk | Street/Sidewalk | 11373.0 | 87-14 57 ROAD | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 40.733060 | -73.874170 | (40.733059618956815, -73.87416975810375) |
5 rows × 53 columns
#color palette for this notebook
colors = ["#5f8e00","#84c600","#d6ecaa","#1c335e","#2ec7ab","#b9ece3",\
"#e8ac65","#c7006a","#ff1f62","#ffb4cb","#ff8214"]
palette = sns.color_palette(palette = colors)
sns.palplot(palette, size =1)
plt.show()
'''
It could be seen that, for numerical atrributes:
`School or Citywide Complaint`, `Verhicle Type` , `Taxi Company Borough`, `Taxi Pick Up Location` and `Garage Lot Name` are all missing values
with their count of row equals to 0 and all other statistical value are NaN.
Hence, we should dropna for these 5 columns.
'''
nyc_df.describe()
| Unique Key | Incident Zip | X Coordinate (State Plane) | Y Coordinate (State Plane) | School or Citywide Complaint | Vehicle Type | Taxi Company Borough | Taxi Pick Up Location | Garage Lot Name | Latitude | Longitude | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3.645580e+05 | 361560.000000 | 3.605280e+05 | 360528.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 360528.000000 | 360528.000000 |
| mean | 3.106595e+07 | 10858.496659 | 1.005043e+06 | 203425.305782 | NaN | NaN | NaN | NaN | NaN | 40.724980 | -73.924946 |
| std | 7.331531e+05 | 578.263114 | 2.196362e+04 | 29842.192857 | NaN | NaN | NaN | NaN | NaN | 0.081907 | 0.079213 |
| min | 2.960737e+07 | 83.000000 | 9.133570e+05 | 121185.000000 | NaN | NaN | NaN | NaN | NaN | 40.499040 | -74.254937 |
| 25% | 3.049938e+07 | 10314.000000 | 9.919460e+05 | 182945.000000 | NaN | NaN | NaN | NaN | NaN | 40.668742 | -73.972253 |
| 50% | 3.108795e+07 | 11209.000000 | 1.003470e+06 | 201023.000000 | NaN | NaN | NaN | NaN | NaN | 40.718406 | -73.930643 |
| 75% | 3.167433e+07 | 11238.000000 | 1.019134e+06 | 222790.000000 | NaN | NaN | NaN | NaN | NaN | 40.778166 | -73.874098 |
| max | 3.231065e+07 | 11697.000000 | 1.067186e+06 | 271876.000000 | NaN | NaN | NaN | NaN | NaN | 40.912869 | -73.700715 |
nyc_df.describe(include= "object")
| Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Incident Address | Street Name | Cross Street 1 | ... | School State | School Zip | School Not Found | Bridge Highway Name | Bridge Highway Direction | Road Ramp | Bridge Highway Segment | Ferry Direction | Ferry Terminal Name | Location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 364558 | 362177 | 364558 | 364558 | 364558 | 358057 | 364425 | 312859 | 312859 | 307370 | ... | 364558 | 364557 | 364558 | 297 | 297 | 262 | 262 | 1 | 2 | 360528 |
| unique | 362018 | 339837 | 1 | 3 | 24 | 45 | 18 | 126372 | 7693 | 6234 | ... | 2 | 1 | 1 | 29 | 34 | 2 | 187 | 1 | 2 | 146751 |
| top | 02/11/2015 10:56:36 PM | 09/10/2015 07:12:49 AM | NYPD | New York City Police Department | Blocked Driveway | No Access | Street/Sidewalk | 1207 BEACH AVENUE | BROADWAY | BEND | ... | Unspecified | Unspecified | N | FDR Dr | East/Queens Bound | Roadway | East 96th St (Exit 14) - Triborough Br (Exit 17) | Manhattan Bound | St. George Terminal (Staten Island) | (40.83036235589997, -73.86602154214397) |
| freq | 3 | 3 | 364558 | 364548 | 100881 | 75888 | 301372 | 1014 | 4122 | 5123 | ... | 364557 | 364557 | 364558 | 36 | 26 | 199 | 6 | 1 | 1 | 1012 |
4 rows × 42 columns
'''
We have 53 attributes and 364.588 observations.
'''
nyc_df.shape
(364558, 53)
'''
For object data type, there are 25 columns has null values and 17 completes columns.
'''
nyc_df.select_dtypes('object').isna().any().value_counts()
True 25 False 17 Name: count, dtype: int64
'''
For numeric data type, there are 10 columns has null values and only 1 completes columns.
'''
nyc_df.select_dtypes('number').isna().any().value_counts()
True 10 False 1 Name: count, dtype: int64
'''
We could see many columns are filled with Null Values.
Columns that are totally missing values: Landmark, Bridge Highway Name, Bridge Highway Direction, Road Ramp, Bridge Highway Segment, Ferry Direction Ferry Terminal Name.
--> Total 7 columns
'''
mn.matrix(nyc_df.select_dtypes(include='object'))
<Axes: >
mn.matrix(nyc_df.select_dtypes(include='number'))
<Axes: >
pio.templates
Templates configuration
-----------------------
Default template: 'simple_white'
Available templates:
['ggplot2', 'seaborn', 'simple_white', 'plotly',
'plotly_white', 'plotly_dark', 'presentation', 'xgridoff',
'ygridoff', 'gridon', 'none']
'''
There are 9 columns has the null percentage over 80%
and 4 columns has the null percentage over 10%
--> we will drop these columns.
'''
columns = []
perc_null = []
for column in nyc_df.select_dtypes(include='object'):
columns.append(column)
perc_null.append(round(nyc_df[column].isna().sum()*100/nyc_df.shape[0], 2))
obj_null_calc = pd.DataFrame(list(zip(columns,perc_null)), columns=['Column','Null Percentage'])
#obj_null_calc.head(100)
fig = px.bar(data_frame=obj_null_calc, x='Column', y ='Null Percentage', color='Column', color_discrete_sequence = colors,
title= "[Object] Null percentage per column", text='Null Percentage')
fig.update_layout(showlegend = False)
fig.update_traces(textposition='inside')
fig.show()
numeric_columns = []
numeric_perc_null = []
for col in nyc_df.select_dtypes(include= 'number'):
numeric_columns.append(col)
numeric_perc_null.append(round(nyc_df[col].isna().sum()*100/nyc_df.shape[0],2))
num_null_calc = pd.DataFrame(list(zip(numeric_columns,numeric_perc_null)), columns=['Columns','Null Percentage'])
fig = px.bar(data_frame=num_null_calc, x='Columns', y='Null Percentage', color='Columns', color_discrete_sequence=colors,
title='[Numeric] Null percentage per column', text = 'Null Percentage')
fig.update_traces(textposition='inside')
fig.update_layout(showlegend=False)
fig.show()
'''
Drop columns with greater then >10% null percentage
=> obj + numberic columns = 18 columns will be dropped
'''
limit_null = nyc_df.shape[0]*0.9
nyc_df_clean = nyc_df.dropna(axis=1, thresh= limit_null)
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 364558 entries, 0 to 364557 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unique Key 364558 non-null int64 1 Created Date 364558 non-null object 2 Closed Date 362177 non-null object 3 Agency 364558 non-null object 4 Agency Name 364558 non-null object 5 Complaint Type 364558 non-null object 6 Descriptor 358057 non-null object 7 Location Type 364425 non-null object 8 Incident Zip 361560 non-null float64 9 Address Type 361306 non-null object 10 City 361561 non-null object 11 Facility Type 362169 non-null object 12 Status 364558 non-null object 13 Due Date 364555 non-null object 14 Resolution Description 364558 non-null object 15 Resolution Action Updated Date 362156 non-null object 16 Community Board 364558 non-null object 17 Borough 364558 non-null object 18 X Coordinate (State Plane) 360528 non-null float64 19 Y Coordinate (State Plane) 360528 non-null float64 20 Park Facility Name 364558 non-null object 21 Park Borough 364558 non-null object 22 School Name 364558 non-null object 23 School Number 364558 non-null object 24 School Region 364557 non-null object 25 School Code 364557 non-null object 26 School Phone Number 364558 non-null object 27 School Address 364558 non-null object 28 School City 364558 non-null object 29 School State 364558 non-null object 30 School Zip 364557 non-null object 31 School Not Found 364558 non-null object 32 Latitude 360528 non-null float64 33 Longitude 360528 non-null float64 34 Location 360528 non-null object dtypes: float64(5), int64(1), object(29) memory usage: 97.3+ MB
'''
Since we have 4 columns described the position of incindent on map, which are lat/long and x/y coordinate.
I would refer to use lat/long then, i will drop these 2 x/y coordinate columns as well as location column (which contains the paired of lat/long).
And I don't need the Incident Zip as well
'''
nyc_df_clean.drop(columns=['X Coordinate (State Plane)','Y Coordinate (State Plane)', 'Location', 'Incident Zip'], inplace=True)
nyc_df_clean.shape
(364558, 31)
mn.dendrogram(nyc_df_clean)
<Axes: >
null_column = pd.Series(nyc_df_clean.isna().sum(), name='Value').to_frame()
null_column
| Value | |
|---|---|
| Unique Key | 0 |
| Created Date | 0 |
| Closed Date | 2381 |
| Agency | 0 |
| Agency Name | 0 |
| Complaint Type | 0 |
| Descriptor | 6501 |
| Location Type | 133 |
| Address Type | 3252 |
| City | 2997 |
| Facility Type | 2389 |
| Status | 0 |
| Due Date | 3 |
| Resolution Description | 0 |
| Resolution Action Updated Date | 2402 |
| Community Board | 0 |
| Borough | 0 |
| Park Facility Name | 0 |
| Park Borough | 0 |
| School Name | 0 |
| School Number | 0 |
| School Region | 1 |
| School Code | 1 |
| School Phone Number | 0 |
| School Address | 0 |
| School City | 0 |
| School State | 0 |
| School Zip | 1 |
| School Not Found | 0 |
| Latitude | 4030 |
| Longitude | 4030 |
'''
For `Descriptor`, `Location Type`, `Address Type`, `City`, `Facility Type` column, i will fill the null with 'Unspecified'
For `Longitude` and `Latitude` , we will drop null rows
For `Closed Date` and `Due Date` and `Resolution Action Updated Date` we will replace the null by plus the created date with the maximum duration to resolved based on each complaint type.
For `School Region`, `School Code`, `School Zip` we will fill it with the mode value.
'''
fill_unspecified_list = ['Descriptor', 'Location Type', 'Address Type', 'City', 'Facility Type']
for col in fill_unspecified_list:
nyc_df_clean[col] = nyc_df_clean[col].fillna('Unspecified')
'''
We could see that all the columns start with 'School' have the top is unspecified.
And the 'school not found' has only 1 value is 'N' which is not informative.
Hence, we will drop all these columns.
'''
nyc_df_clean.loc[:,nyc_df_clean.columns.str.startswith('School')].describe()
| School Name | School Number | School Region | School Code | School Phone Number | School Address | School City | School State | School Zip | School Not Found | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 364558 | 364558 | 364557 | 364557 | 364558 | 364558 | 364558 | 364558 | 364557 | 364558 |
| unique | 2 | 2 | 1 | 1 | 2 | 2 | 2 | 2 | 1 | 1 |
| top | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | Unspecified | N |
| freq | 364557 | 364557 | 364557 | 364557 | 364557 | 364557 | 364557 | 364557 | 364557 | 364558 |
nyc_df_clean.drop(columns=nyc_df_clean.loc[:,nyc_df_clean.columns.str.startswith('School')].columns.values, inplace=True)
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 364558 entries, 0 to 364557 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unique Key 364558 non-null int64 1 Created Date 364558 non-null object 2 Closed Date 362177 non-null object 3 Agency 364558 non-null object 4 Agency Name 364558 non-null object 5 Complaint Type 364558 non-null object 6 Descriptor 364558 non-null object 7 Location Type 364558 non-null object 8 Address Type 364558 non-null object 9 City 364558 non-null object 10 Facility Type 364558 non-null object 11 Status 364558 non-null object 12 Due Date 364555 non-null object 13 Resolution Description 364558 non-null object 14 Resolution Action Updated Date 362156 non-null object 15 Community Board 364558 non-null object 16 Borough 364558 non-null object 17 Park Facility Name 364558 non-null object 18 Park Borough 364558 non-null object 19 Latitude 360528 non-null float64 20 Longitude 360528 non-null float64 dtypes: float64(2), int64(1), object(18) memory usage: 58.4+ MB
nyc_df_clean['Longitude'] = nyc_df_clean['Longitude'].fillna(nyc_df_clean['Longitude'].mode()[0])
nyc_df_clean['Latitude'] = nyc_df_clean['Latitude'].fillna(nyc_df_clean['Latitude'].mode()[0])
nyc_df_clean.loc[:,nyc_df_clean.columns.str.endswith('Date')]
| Created Date | Closed Date | Due Date | Resolution Action Updated Date | |
|---|---|---|---|---|
| 0 | 12/31/2015 11:59:45 PM | 01/01/2016 12:55:15 AM | 01/01/2016 07:59:45 AM | 01/01/2016 12:55:15 AM |
| 1 | 12/31/2015 11:59:44 PM | 01/01/2016 01:26:57 AM | 01/01/2016 07:59:44 AM | 01/01/2016 01:26:57 AM |
| 2 | 12/31/2015 11:59:29 PM | 01/01/2016 04:51:03 AM | 01/01/2016 07:59:29 AM | 01/01/2016 04:51:03 AM |
| 3 | 12/31/2015 11:57:46 PM | 01/01/2016 07:43:13 AM | 01/01/2016 07:57:46 AM | 01/01/2016 07:43:13 AM |
| 4 | 12/31/2015 11:56:58 PM | 01/01/2016 03:24:42 AM | 01/01/2016 07:56:58 AM | 01/01/2016 03:24:42 AM |
| ... | ... | ... | ... | ... |
| 364553 | 01/01/2015 12:04:44 AM | 01/01/2015 10:22:31 AM | 01/01/2015 08:04:44 AM | 01/01/2015 10:22:31 AM |
| 364554 | 01/01/2015 12:04:28 AM | 01/01/2015 02:25:02 AM | 01/01/2015 08:04:28 AM | 01/01/2015 02:25:02 AM |
| 364555 | 01/01/2015 12:01:30 AM | 01/01/2015 12:20:33 AM | 01/01/2015 08:01:30 AM | 01/01/2015 12:20:33 AM |
| 364556 | 01/01/2015 12:01:29 AM | 01/01/2015 02:42:22 AM | 01/01/2015 08:01:29 AM | 01/01/2015 02:42:22 AM |
| 364557 | 01/01/2015 12:00:50 AM | 01/01/2015 02:47:50 AM | 01/01/2015 08:00:50 AM | 01/01/2015 02:47:50 AM |
364558 rows × 4 columns
date_type_column = nyc_df_clean.loc[:, nyc_df_clean.columns.str.endswith('Date')].columns.values
for col in date_type_column:
nyc_df_clean[col] = pd.to_datetime(nyc_df_clean[col], format='%m/%d/%Y %I:%M:%S %p')
#nyc_df_clean[col] = nyc_df_clean[col].dt.strftime('%m/%d/%Y %H:%M:%S')
nyc_df_clean.loc[:,nyc_df_clean.columns.str.endswith('Date')]
| Created Date | Closed Date | Due Date | Resolution Action Updated Date | |
|---|---|---|---|---|
| 0 | 2015-12-31 23:59:45 | 2016-01-01 00:55:15 | 2016-01-01 07:59:45 | 2016-01-01 00:55:15 |
| 1 | 2015-12-31 23:59:44 | 2016-01-01 01:26:57 | 2016-01-01 07:59:44 | 2016-01-01 01:26:57 |
| 2 | 2015-12-31 23:59:29 | 2016-01-01 04:51:03 | 2016-01-01 07:59:29 | 2016-01-01 04:51:03 |
| 3 | 2015-12-31 23:57:46 | 2016-01-01 07:43:13 | 2016-01-01 07:57:46 | 2016-01-01 07:43:13 |
| 4 | 2015-12-31 23:56:58 | 2016-01-01 03:24:42 | 2016-01-01 07:56:58 | 2016-01-01 03:24:42 |
| ... | ... | ... | ... | ... |
| 364553 | 2015-01-01 00:04:44 | 2015-01-01 10:22:31 | 2015-01-01 08:04:44 | 2015-01-01 10:22:31 |
| 364554 | 2015-01-01 00:04:28 | 2015-01-01 02:25:02 | 2015-01-01 08:04:28 | 2015-01-01 02:25:02 |
| 364555 | 2015-01-01 00:01:30 | 2015-01-01 00:20:33 | 2015-01-01 08:01:30 | 2015-01-01 00:20:33 |
| 364556 | 2015-01-01 00:01:29 | 2015-01-01 02:42:22 | 2015-01-01 08:01:29 | 2015-01-01 02:42:22 |
| 364557 | 2015-01-01 00:00:50 | 2015-01-01 02:47:50 | 2015-01-01 08:00:50 | 2015-01-01 02:47:50 |
364558 rows × 4 columns
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 364558 entries, 0 to 364557 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unique Key 364558 non-null int64 1 Created Date 364558 non-null datetime64[ns] 2 Closed Date 362177 non-null datetime64[ns] 3 Agency 364558 non-null object 4 Agency Name 364558 non-null object 5 Complaint Type 364558 non-null object 6 Descriptor 364558 non-null object 7 Location Type 364558 non-null object 8 Address Type 364558 non-null object 9 City 364558 non-null object 10 Facility Type 364558 non-null object 11 Status 364558 non-null object 12 Due Date 364555 non-null datetime64[ns] 13 Resolution Description 364558 non-null object 14 Resolution Action Updated Date 362156 non-null datetime64[ns] 15 Community Board 364558 non-null object 16 Borough 364558 non-null object 17 Park Facility Name 364558 non-null object 18 Park Borough 364558 non-null object 19 Latitude 364558 non-null float64 20 Longitude 364558 non-null float64 dtypes: datetime64[ns](4), float64(2), int64(1), object(14) memory usage: 58.4+ MB
'''
We could seen that the complaint type Ferry Complaint has no Due Date and Closed Date and 1 missing Value on Resolution Action Updated Date
So it is not informative for us, we will drop these rows.
'''
display(nyc_df_clean[nyc_df_clean['Complaint Type']=='Ferry Complaint'][['Complaint Type','Created Date', 'Due Date','Closed Date']])
nyc_df_clean[nyc_df_clean['Complaint Type']=='Ferry Complaint'].info()
| Complaint Type | Created Date | Due Date | Closed Date | |
|---|---|---|---|---|
| 161610 | Ferry Complaint | 2015-08-03 08:28:29 | NaT | NaT |
| 192974 | Ferry Complaint | 2015-07-06 09:12:34 | NaT | NaT |
<class 'pandas.core.frame.DataFrame'> Index: 2 entries, 161610 to 192974 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unique Key 2 non-null int64 1 Created Date 2 non-null datetime64[ns] 2 Closed Date 0 non-null datetime64[ns] 3 Agency 2 non-null object 4 Agency Name 2 non-null object 5 Complaint Type 2 non-null object 6 Descriptor 2 non-null object 7 Location Type 2 non-null object 8 Address Type 2 non-null object 9 City 2 non-null object 10 Facility Type 2 non-null object 11 Status 2 non-null object 12 Due Date 0 non-null datetime64[ns] 13 Resolution Description 2 non-null object 14 Resolution Action Updated Date 1 non-null datetime64[ns] 15 Community Board 2 non-null object 16 Borough 2 non-null object 17 Park Facility Name 2 non-null object 18 Park Borough 2 non-null object 19 Latitude 2 non-null float64 20 Longitude 2 non-null float64 dtypes: datetime64[ns](4), float64(2), int64(1), object(14) memory usage: 352.0+ bytes
nyc_df_clean = nyc_df_clean.drop(nyc_df_clean[(nyc_df_clean['Complaint Type'] == 'Ferry Complaint') & (nyc_df_clean['Due Date'].isnull())].index)
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'> Index: 364556 entries, 0 to 364557 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unique Key 364556 non-null int64 1 Created Date 364556 non-null datetime64[ns] 2 Closed Date 362177 non-null datetime64[ns] 3 Agency 364556 non-null object 4 Agency Name 364556 non-null object 5 Complaint Type 364556 non-null object 6 Descriptor 364556 non-null object 7 Location Type 364556 non-null object 8 Address Type 364556 non-null object 9 City 364556 non-null object 10 Facility Type 364556 non-null object 11 Status 364556 non-null object 12 Due Date 364555 non-null datetime64[ns] 13 Resolution Description 364556 non-null object 14 Resolution Action Updated Date 362155 non-null datetime64[ns] 15 Community Board 364556 non-null object 16 Borough 364556 non-null object 17 Park Facility Name 364556 non-null object 18 Park Borough 364556 non-null object 19 Latitude 364556 non-null float64 20 Longitude 364556 non-null float64 dtypes: datetime64[ns](4), float64(2), int64(1), object(14) memory usage: 61.2+ MB
'''
For the remain row of dude is NaT and resolution action updated date is NaT. We could see that the Closed Date is same day with the Created Date.
So i will fill NaT for Due Date and Resolution Action with the same Closed Date.
'''
nyc_df_clean[nyc_df_clean['Due Date'].isnull()]
| Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Address Type | City | ... | Status | Due Date | Resolution Description | Resolution Action Updated Date | Community Board | Borough | Park Facility Name | Park Borough | Latitude | Longitude | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 175921 | 31129107 | 2015-07-21 16:05:39 | 2015-07-21 21:56:38 | NYPD | NYPD | Blocked Driveway | No Access | Unspecified | ADDRESS | BROOKLYN | ... | Draft | NaT | The Police Department responded to the complai... | NaT | 10 BROOKLYN | BROOKLYN | Unspecified | BROOKLYN | 40.625772 | -73.999564 |
1 rows × 21 columns
mask = (nyc_df_clean['Due Date'].isnull()) & (nyc_df_clean['Resolution Action Updated Date'].isnull())
nyc_df_clean.loc[mask, ['Due Date', 'Resolution Action Updated Date']] = nyc_df_clean.loc[mask, 'Closed Date']
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'> Index: 364556 entries, 0 to 364557 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unique Key 364556 non-null int64 1 Created Date 364556 non-null datetime64[ns] 2 Closed Date 362177 non-null datetime64[ns] 3 Agency 364556 non-null object 4 Agency Name 364556 non-null object 5 Complaint Type 364556 non-null object 6 Descriptor 364556 non-null object 7 Location Type 364556 non-null object 8 Address Type 364556 non-null object 9 City 364556 non-null object 10 Facility Type 364556 non-null object 11 Status 364556 non-null object 12 Due Date 364556 non-null datetime64[ns] 13 Resolution Description 364556 non-null object 14 Resolution Action Updated Date 362156 non-null datetime64[ns] 15 Community Board 364556 non-null object 16 Borough 364556 non-null object 17 Park Facility Name 364556 non-null object 18 Park Borough 364556 non-null object 19 Latitude 364556 non-null float64 20 Longitude 364556 non-null float64 dtypes: datetime64[ns](4), float64(2), int64(1), object(14) memory usage: 61.2+ MB
nyc_df_clean[(nyc_df_clean['Closed Date'].isnull()) & (nyc_df_clean['Resolution Action Updated Date'].isnull())]
| Unique Key | Created Date | Closed Date | Agency | Agency Name | Complaint Type | Descriptor | Location Type | Address Type | City | ... | Status | Due Date | Resolution Description | Resolution Action Updated Date | Community Board | Borough | Park Facility Name | Park Borough | Latitude | Longitude | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 416 | 32305700 | 2015-12-31 14:16:04 | NaT | NYPD | New York City Police Department | Illegal Parking | Posted Parking Sign Violation | Street/Sidewalk | Unspecified | Unspecified | ... | Open | 2015-12-31 22:16:04 | Your complaint has been forwarded to the New Y... | NaT | 0 Unspecified | Unspecified | Unspecified | Unspecified | 40.830362 | -73.866022 |
| 611 | 32309308 | 2015-12-31 09:58:06 | NaT | NYPD | New York City Police Department | Noise - Street/Sidewalk | Loud Music/Party | Street/Sidewalk | Unspecified | Unspecified | ... | Open | 2015-12-31 17:58:06 | Your complaint has been forwarded to the New Y... | NaT | 0 Unspecified | Unspecified | Unspecified | Unspecified | 40.830362 | -73.866022 |
| 1648 | 32303348 | 2015-12-30 05:13:42 | NaT | NYPD | New York City Police Department | Illegal Parking | Commercial Overnight Parking | Street/Sidewalk | Unspecified | Unspecified | ... | Open | 2015-12-30 13:13:42 | Your complaint has been forwarded to the New Y... | NaT | 0 Unspecified | Unspecified | Unspecified | Unspecified | 40.830362 | -73.866022 |
| 1816 | 32294519 | 2015-12-29 22:44:50 | NaT | NYPD | New York City Police Department | Derelict Vehicle | With License Plate | Street/Sidewalk | Unspecified | Unspecified | ... | Open | 2015-12-30 06:44:50 | Your complaint has been forwarded to the New Y... | NaT | 0 Unspecified | Unspecified | Unspecified | Unspecified | 40.830362 | -73.866022 |
| 1965 | 32296487 | 2015-12-29 19:09:13 | NaT | NYPD | New York City Police Department | Derelict Vehicle | With License Plate | Street/Sidewalk | Unspecified | Unspecified | ... | Open | 2015-12-30 03:09:13 | Your complaint has been forwarded to the New Y... | NaT | 0 Unspecified | Unspecified | Unspecified | Unspecified | 40.830362 | -73.866022 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 363643 | 29618691 | 2015-01-02 11:38:52 | NaT | NYPD | New York City Police Department | Derelict Vehicle | With License Plate | Street/Sidewalk | Unspecified | Unspecified | ... | Open | 2015-01-02 19:38:52 | Your complaint has been forwarded to the New Y... | NaT | 0 Unspecified | Unspecified | Unspecified | Unspecified | 40.830362 | -73.866022 |
| 363789 | 29619230 | 2015-01-02 07:25:16 | NaT | NYPD | New York City Police Department | Illegal Parking | Posted Parking Sign Violation | Street/Sidewalk | Unspecified | Unspecified | ... | Open | 2015-01-02 15:25:16 | Your complaint has been forwarded to the New Y... | NaT | 0 Unspecified | Unspecified | Unspecified | Unspecified | 40.830362 | -73.866022 |
| 363805 | 29618672 | 2015-01-02 06:14:06 | NaT | NYPD | New York City Police Department | Derelict Vehicle | With License Plate | Street/Sidewalk | Unspecified | Unspecified | ... | Open | 2015-01-02 14:14:06 | Your complaint has been forwarded to the New Y... | NaT | 0 Unspecified | Unspecified | Unspecified | Unspecified | 40.830362 | -73.866022 |
| 364151 | 29617561 | 2015-01-01 16:04:35 | NaT | NYPD | New York City Police Department | Illegal Parking | Blocked Hydrant | Street/Sidewalk | Unspecified | Unspecified | ... | Assigned | 2015-01-02 00:04:35 | Your complaint has been forwarded to the New Y... | NaT | 0 Unspecified | Unspecified | Unspecified | Unspecified | 40.830362 | -73.866022 |
| 364349 | 29617257 | 2015-01-01 07:30:10 | NaT | NYPD | New York City Police Department | Illegal Parking | Posted Parking Sign Violation | Street/Sidewalk | Unspecified | Unspecified | ... | Open | 2015-01-01 15:30:10 | Your complaint has been forwarded to the New Y... | NaT | 0 Unspecified | Unspecified | Unspecified | Unspecified | 40.830362 | -73.866022 |
2362 rows × 21 columns
'''
We could seen the closed date is nan because the status hasn't closed yet.
So there is not problem with this column.
But the draft status seems to be useless so i will drop that row.
'''
nyc_df_clean.groupby('Status')['Closed Date'].apply(lambda x: x.isnull().sum())
Status Assigned 776 Closed 0 Draft 1 Open 1602 Name: Closed Date, dtype: int64
nyc_df_clean = nyc_df_clean[nyc_df_clean['Status'] != 'Draft']
nyc_df_clean['Status'].value_counts()
Status Closed 362114 Open 1638 Assigned 802 Name: count, dtype: int64
nyc_df_clean.groupby('Status')['Resolution Action Updated Date'].apply(lambda x: x.isnull().sum())
Status Assigned 760 Closed 2 Open 1637 Name: Resolution Action Updated Date, dtype: int64
''' The Resolution Action was missing for closed status. But there were closed date already so i will replace these missing with it closed date'''
nyc_df_clean[(nyc_df_clean['Status'] == 'Closed') & (nyc_df_clean['Resolution Action Updated Date'].isnull())][['Closed Date','Due Date','Complaint Type','Resolution Description']]
| Closed Date | Due Date | Complaint Type | Resolution Description | |
|---|---|---|---|---|
| 175964 | 2015-07-21 17:24:17 | 2015-07-21 23:15:37 | Illegal Parking | The Police Department responded to the complai... |
| 294151 | 2015-04-05 21:09:42 | 2015-04-06 04:54:44 | Blocked Driveway | Your complaint has been forwarded to the New Y... |
mask2 = (nyc_df_clean['Status'] == 'Closed') & (nyc_df_clean['Resolution Action Updated Date'].isnull())
nyc_df_clean.loc[mask2,'Resolution Action Updated Date'] = nyc_df_clean.loc[mask2, 'Closed Date']
nyc_df_clean['Resolution Description'].value_counts()
Resolution Description The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time. 107051 The Police Department responded to the complaint and took action to fix the condition. 72916 The Police Department responded and upon arrival those responsible for the condition were gone. 71281 The Police Department responded to the complaint and determined that police action was not necessary. 47612 The Police Department issued a summons in response to the complaint. 37342 The Police Department reviewed your complaint and provided additional information below. 16040 Your request can not be processed at this time because of insufficient contact information. Please create a new Service Request on NYC.gov and provide more detailed contact information. 5258 This complaint does not fall under the Police Department's jurisdiction. 2113 Your complaint has been forwarded to the New York Police Department for a non-emergency response. 311 will have additional information in 8 hours. Please note your service request number for future reference. 2108 The Police Department responded to the complaint but officers were unable to gain entry into the premises. 1549 The Police Department responded to the complaint and a report was prepared. 869 Your complaint has been forwarded to the New York Police Department for a non-emergency response. If the police determine the vehicle is illegally parked, they will ticket the vehicle and then you may either contact a private towing company to remove the vehicle or ask your local precinct to contact 'rotation tow'. Any fees charged for towing will have to be paid by the vehicle owner. 311 will have additional information in 8 hours. Please note your service request number for future reference. 254 The Police Department made an arrest in response to the complaint. 151 The New York City Police Department received your comments and forwarded them to the appropriate unit for resolution. You may follow up by calling (646) 610-6952 after 60 days from submitting your agency issue. 8 Your complaint has been received by the Police Department and it has been determined that a long-term investigation may be necessary. Additional information will be available at the conclusion of the investigation. 1 The condition was determined to be an issue appropriate for handling by an alternate entity. The Department of Parks and Recreation has notified the appropriate resource. 1 Name: count, dtype: int64
'''
The 'Your complaint has been forwarded to the New York Police Department for a non-emergency response. 311 will have additional information in 8 hours. Please note your service request number for future reference.'
solution has 100% null values for resolution action updated date.
'''
nyc_df_clean.groupby('Resolution Description')['Resolution Action Updated Date'].apply(lambda x: x.isnull().sum())
Resolution Description The New York City Police Department received your comments and forwarded them to the appropriate unit for resolution. You may follow up by calling (646) 610-6952 after 60 days from submitting your agency issue. 0 The Police Department issued a summons in response to the complaint. 1 The Police Department made an arrest in response to the complaint. 0 The Police Department responded and upon arrival those responsible for the condition were gone. 8 The Police Department responded to the complaint and a report was prepared. 0 The Police Department responded to the complaint and determined that police action was not necessary. 5 The Police Department responded to the complaint and took action to fix the condition. 14 The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time. 8 The Police Department responded to the complaint but officers were unable to gain entry into the premises. 0 The Police Department reviewed your complaint and provided additional information below. 0 The condition was determined to be an issue appropriate for handling by an alternate entity. The Department of Parks and Recreation has notified the appropriate resource. 0 This complaint does not fall under the Police Department's jurisdiction. 0 Your complaint has been forwarded to the New York Police Department for a non-emergency response. 311 will have additional information in 8 hours. Please note your service request number for future reference. 2108 Your complaint has been forwarded to the New York Police Department for a non-emergency response. If the police determine the vehicle is illegally parked, they will ticket the vehicle and then you may either contact a private towing company to remove the vehicle or ask your local precinct to contact 'rotation tow'. Any fees charged for towing will have to be paid by the vehicle owner. 311 will have additional information in 8 hours. Please note your service request number for future reference. 253 Your complaint has been received by the Police Department and it has been determined that a long-term investigation may be necessary. Additional information will be available at the conclusion of the investigation. 0 Your request can not be processed at this time because of insufficient contact information. Please create a new Service Request on NYC.gov and provide more detailed contact information. 0 Name: Resolution Action Updated Date, dtype: int64
mask3 = nyc_df_clean['Resolution Description'] == 'Your complaint has been forwarded to the New York Police Department for a non-emergency response. 311 will have additional information in 8 hours. Please note your service request number for future reference.'
nyc_df_clean.loc[mask3, ['Created Date','Closed Date','Due Date']]
| Created Date | Closed Date | Due Date | |
|---|---|---|---|
| 416 | 2015-12-31 14:16:04 | NaT | 2015-12-31 22:16:04 |
| 611 | 2015-12-31 09:58:06 | NaT | 2015-12-31 17:58:06 |
| 1648 | 2015-12-30 05:13:42 | NaT | 2015-12-30 13:13:42 |
| 1816 | 2015-12-29 22:44:50 | NaT | 2015-12-30 06:44:50 |
| 1965 | 2015-12-29 19:09:13 | NaT | 2015-12-30 03:09:13 |
| ... | ... | ... | ... |
| 363643 | 2015-01-02 11:38:52 | NaT | 2015-01-02 19:38:52 |
| 363789 | 2015-01-02 07:25:16 | NaT | 2015-01-02 15:25:16 |
| 363805 | 2015-01-02 06:14:06 | NaT | 2015-01-02 14:14:06 |
| 364151 | 2015-01-01 16:04:35 | NaT | 2015-01-02 00:04:35 |
| 364349 | 2015-01-01 07:30:10 | NaT | 2015-01-01 15:30:10 |
2108 rows × 3 columns
'''
For these 2 resolution description, we will replace it with the created date + 15 minutes
'''
mask4 = (nyc_df_clean['Resolution Description'] == "Your complaint has been forwarded to the New York Police Department for a non-emergency response. If the police determine the vehicle is illegally parked, they will ticket the vehicle and then you may either contact a private towing company to remove the vehicle or ask your local precinct to contact 'rotation tow'. Any fees charged for towing will have to be paid by the vehicle owner. 311 will have additional information in 8 hours. Please note your service request number for future reference.")\
& (nyc_df_clean['Closed Date'].notnull())
nyc_df_clean.loc[mask4, ['Created Date','Closed Date','Due Date']]
| Created Date | Closed Date | Due Date | |
|---|---|---|---|
| 294151 | 2015-04-05 20:54:44 | 2015-04-05 21:09:42 | 2015-04-06 04:54:44 |
def fill_na_resolution_date(row):
if pd.isnull(row['Resolution Action Updated Date']) and row['Resolution Description'] in ["Your complaint has been forwarded to the New York Police Department for a non-emergency response. 311 will have additional information in 8 hours. Please note your service request number for future reference.",
"Your complaint has been forwarded to the New York Police Department for a non-emergency response. If the police determine the vehicle is illegally parked, they will ticket the vehicle and then you may either contact a private towing company to remove the vehicle or ask your local precinct to contact 'rotation tow'. Any fees charged for towing will have to be paid by the vehicle owner. 311 will have additional information in 8 hours. Please note your service request number for future reference."]:
return row['Created Date'] + pd.Timedelta(minutes=15)
return row['Resolution Action Updated Date']
nyc_df_clean['Resolution Action Updated Date'] = nyc_df_clean.apply(fill_na_resolution_date, axis=1)
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'> Index: 364554 entries, 0 to 364557 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unique Key 364554 non-null int64 1 Created Date 364554 non-null datetime64[ns] 2 Closed Date 362176 non-null datetime64[ns] 3 Agency 364554 non-null object 4 Agency Name 364554 non-null object 5 Complaint Type 364554 non-null object 6 Descriptor 364554 non-null object 7 Location Type 364554 non-null object 8 Address Type 364554 non-null object 9 City 364554 non-null object 10 Facility Type 364554 non-null object 11 Status 364554 non-null object 12 Due Date 364554 non-null datetime64[ns] 13 Resolution Description 364554 non-null object 14 Resolution Action Updated Date 364518 non-null datetime64[ns] 15 Community Board 364554 non-null object 16 Borough 364554 non-null object 17 Park Facility Name 364554 non-null object 18 Park Borough 364554 non-null object 19 Latitude 364554 non-null float64 20 Longitude 364554 non-null float64 dtypes: datetime64[ns](4), float64(2), int64(1), object(14) memory usage: 61.2+ MB
nyc_df_clean[nyc_df_clean['Resolution Action Updated Date'].isnull()]['Resolution Description'].value_counts()
Resolution Description The Police Department responded to the complaint and took action to fix the condition. 14 The Police Department responded and upon arrival those responsible for the condition were gone. 8 The Police Department responded to the complaint and with the information available observed no evidence of the violation at that time. 8 The Police Department responded to the complaint and determined that police action was not necessary. 5 The Police Department issued a summons in response to the complaint. 1 Name: count, dtype: int64
''' For the remain null value of resolution action updated date, we will replace with the max duration between resolution action updated date and created date
group by each resolution description '''
nyc_df_clean['duration'] = nyc_df_clean['Resolution Action Updated Date'] - nyc_df_clean['Created Date']
max_duration = nyc_df_clean.groupby('Resolution Description')['duration'].max()
def fill_na_with_max_duration(row):
if pd.isnull(row['Resolution Action Updated Date']):
resolution_desc = row['Resolution Description']
max_duration_time = max_duration[resolution_desc]
return row['Created Date'] + max_duration_time
return row['Resolution Action Updated Date']
nyc_df_clean['Resolution Action Updated Date'] = nyc_df_clean.apply(fill_na_with_max_duration, axis = 1)
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'> Index: 364554 entries, 0 to 364557 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unique Key 364554 non-null int64 1 Created Date 364554 non-null datetime64[ns] 2 Closed Date 362176 non-null datetime64[ns] 3 Agency 364554 non-null object 4 Agency Name 364554 non-null object 5 Complaint Type 364554 non-null object 6 Descriptor 364554 non-null object 7 Location Type 364554 non-null object 8 Address Type 364554 non-null object 9 City 364554 non-null object 10 Facility Type 364554 non-null object 11 Status 364554 non-null object 12 Due Date 364554 non-null datetime64[ns] 13 Resolution Description 364554 non-null object 14 Resolution Action Updated Date 364554 non-null datetime64[ns] 15 Community Board 364554 non-null object 16 Borough 364554 non-null object 17 Park Facility Name 364554 non-null object 18 Park Borough 364554 non-null object 19 Latitude 364554 non-null float64 20 Longitude 364554 non-null float64 21 duration 364518 non-null timedelta64[ns] dtypes: datetime64[ns](4), float64(2), int64(1), object(14), timedelta64[ns](1) memory usage: 64.0+ MB
'''
We could seen that the Park Borough has the same value with Borough, so we just need to keep one of them
Moreover, the Park Facility Name has a huge unspecified value, so we will drop this columnn.
'''
print(nyc_df_clean['Park Facility Name'].value_counts())
print((nyc_df_clean['Borough']==nyc_df_clean['Park Borough']).sum())
Park Facility Name Unspecified 364553 Alley Pond Park - Nature Center 1 Name: count, dtype: int64 364554
nyc_df_clean.drop(columns=['Park Borough','Park Facility Name','duration'], inplace=True)
nyc_df_clean.info()
<class 'pandas.core.frame.DataFrame'> Index: 364554 entries, 0 to 364557 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unique Key 364554 non-null int64 1 Created Date 364554 non-null datetime64[ns] 2 Closed Date 362176 non-null datetime64[ns] 3 Agency 364554 non-null object 4 Agency Name 364554 non-null object 5 Complaint Type 364554 non-null object 6 Descriptor 364554 non-null object 7 Location Type 364554 non-null object 8 Address Type 364554 non-null object 9 City 364554 non-null object 10 Facility Type 364554 non-null object 11 Status 364554 non-null object 12 Due Date 364554 non-null datetime64[ns] 13 Resolution Description 364554 non-null object 14 Resolution Action Updated Date 364554 non-null datetime64[ns] 15 Community Board 364554 non-null object 16 Borough 364554 non-null object 17 Latitude 364554 non-null float64 18 Longitude 364554 non-null float64 dtypes: datetime64[ns](4), float64(2), int64(1), object(12) memory usage: 55.6+ MB
'''
We could see that only one agency value has 3 different Agency Name.
Actually, the Internal Affairs Bureau ("IAB") was set up as a unit within the NYPD that investigates serious police misconduct as well as police corruption.
Hence, we only have one agency here, it is NYDC or New York City Police Department.
So actually, these 2 columns aren't necessary.
We could drop them but i don't mind to keep them in the dataframe.
'''
nyc_df_clean.groupby('Agency')['Agency Name'].value_counts()
Agency Agency Name
NYPD New York City Police Department 364545
Internal Affairs Bureau 8
NYPD 1
Name: count, dtype: int64
complaint_type_counts = nyc_df_clean['Complaint Type'].value_counts()
fig = px.bar(x = complaint_type_counts.index, y = complaint_type_counts.values, color=complaint_type_counts.index, color_discrete_sequence=colors,
title='Which one is the most complaint in NY?')
fig.update_layout(showlegend = False, xaxis_title = 'Complaint Type', yaxis_title='Number of Complaint')
borough_counts = nyc_df_clean.groupby('Borough').size().reset_index(name='Count')
fig = px.pie(borough_counts, values='Count', names='Borough', color_discrete_sequence=colors, title='Complaint Distribution by Borough')
fig.show()
complaint_counts = nyc_df_clean.groupby(['Borough', 'Complaint Type']).size().reset_index(name='Count')
complaint_counts = complaint_counts.sort_values(by=['Borough', 'Count'], ascending=[True, False])
num_boroughs = len(complaint_counts['Borough'].unique())
fig = make_subplots(rows=3, cols=2, subplot_titles=complaint_counts['Borough'].unique())
color_borough = colors[:len(complaint_counts['Borough'].unique())]
for i, (borough, color) in enumerate(zip(complaint_counts['Borough'].unique(), color_borough), 1):
data_borough = complaint_counts[complaint_counts['Borough'] == borough]
fig.add_trace(
go.Bar(x=data_borough['Complaint Type'], y=data_borough['Count'], name=borough, marker=dict(color=color)),
row=(i - 1) // 2 + 1, col=(i - 1) % 2 + 1 # Adjust the row and column indices
)
fig.update_layout(
height=600 * (num_boroughs/2), # Adjust the height of the entire figure based on the number of unique Boroughs
showlegend=False, # Hide the legend
title_text="Number of Complaint Types in each Borough (Sorted by Highest Count)",
title_x=0.5 # Align the main title to the center
)
fig.show()
It looks like the popular type of complaint is blocked driveway and illegal parking. These 2 type of complaints have the highest number of complaint received ticket to NYPD in BRONX, BROOKLYN, QUEENS, AND STATE ISLAND. In MANHATTAN borough, the top 2 issues are all about Noise - Street/Sidewalk and Noise - Commercial.
For Blocked Driveway issue, there is 75% descriptor about 'No Access' Meanwhile, for Illegal parking, most of the complaint is about Posted Parking sign Violation (29%) and Blocked Hydrant (22%)
blocked_driveway_df = nyc_df_clean[nyc_df_clean['Complaint Type'] == "Blocked Driveway"].groupby('Descriptor').size().reset_index(name = 'Count')
illegal_parking_df = nyc_df_clean[nyc_df_clean['Complaint Type'] == "Illegal Parking"].groupby('Descriptor').size().reset_index(name = 'Count')
fig = make_subplots(rows=1, cols=2, subplot_titles=['Blocked Driveway', 'Illegal Parking'], specs=[[{'type':'domain'}, {'type':'domain'}]])
# Add the pie chart for "Blocked Driveway" to the subplot
fig.add_trace(
go.Pie(labels=blocked_driveway_df['Descriptor'], values=blocked_driveway_df['Count'], name='Blocked Driveway'),
row=1, col=1
)
# Add the pie chart for "Illegal Parking" to the subplot
fig.add_trace(
go.Pie(labels=illegal_parking_df['Descriptor'], values=illegal_parking_df['Count'], name='Illegal Parking'),
row=1, col=2
)
fig.update_layout(
title_text="Descriptor Distribution for Blocked Driveway and Illegal Parking",
title_x=0.5
)
fig.update_traces( marker=dict(colors=colors, line=dict(color='#ffffff', width=1)))
fig.show()